**********
* Readme *
**********

* This script creates the variables required for the survival estimations.


* Root folder (PATH TO BE DEFINED BY THE USER)
**********************************************
clear all
global analysis "C:/***/replication_package"


* Timestamped log
*****************
* Close any log left open by a previous (possibly interrupted) run: -clear all-
* does not close logs, and -log using- stops with an error if one is still open.
capture log close

* Today's date formatted as YYMMDD, used as a suffix of the log file name
global today = strofreal(date(c(current_date), "DMY"), "%tdYYNNDD")
log using "${analysis}/code/logs/2_5_build_pnad_duration_${today}.smcl", replace


***********************
* EMPLOYMENT duration *
***********************

/* Only workers who were observed in wage employment at least once, one row
per individual, covariates measured as of the last observation before transition or censoring.
Note: the strategy below assumes the latest information is the correct one in conflicting cases.

Implementation note: every statement that looks at the next interview ([_n+1])
is run under "bysort ind_id (date):", so observations are explicitly ordered by
date within individual at the moment the statement runs. A plain
"bysort ind_id:" only guarantees grouping by ind_id; the within-individual
order set by an earlier -sort- can be lost if the data are re-sorted in between
(e.g. internally by -egen-), because -sort- does not preserve the order of ties.
NOTE(review): this assumes (ind_id, date) uniquely identifies a row - confirm. */

* Import general data set
use "${analysis}/data/2_4_pnad_clean.dta", clear

* Sort dataset by individual and then by date
sort ind_id date

* Define interview rounds (useful as sanity check if need to debug)
* bysort ind_id (date): gen int_n = _n
* order int_n

* Remove individuals observed a single time
bysort ind_id: gen int_N = _N
drop if int_N == 1
drop int_N

* Subset the sample to individuals who were seen at risk (i.e. in employment) at least once
bysort ind_id: egen ever_employee = total(work_state_name == "Employee")
keep if ever_employee != 0
drop ever_employee

* [how_long_e] The main object of interest, the reported duration of the spell at each date
* (blanked out on rows where the individual is not an employee)
replace how_long_ea = . if work_state_name != "Employee"
replace how_long_eb = . if work_state_name != "Employee"

* [first_obs_e] FIRST date the individual is observed at risk (constant within individual)
gen date_e = date if work_state_name == "Employee"
bysort ind_id: egen first_obs_e = min(date_e)
drop date_e

* [obs_window] For how long has the individual been observed at risk? i.e. follow-up length at each date
gen obs_window = date - first_obs_e if work_state_name == "Employee"

* [t0_e] The implicit elapsed duration before the observation window
* (negative values are set to 0; missing values stay missing because missing is not < 0)
gen t0_e = how_long_ea - obs_window
replace t0_e = 0 if t0_e < 0

* [still_at_risk_e] Will we see the individual still at risk after this period?
* 1 = employee now and at the next interview; 0 = employee now but not at the next
* interview (including when there is no next interview); missing when not an employee now
bysort ind_id (date): gen still_at_risk_e = (work_state_name[_n+1] == "Employee") if work_state_name == "Employee"

* [last_obs_e] When do I see the FIRST state change? i.e. last known point of the first relevant duration (constant within individual)
gen date_e = date if still_at_risk_e == 0
bysort ind_id: egen last_obs_e = min(date_e)
drop date_e

* [ta_e] Left boundary of the interval where the failure takes place (one obs. per individual)
gen ta_e = how_long_ea if last_obs_e == date

* [fail_e] Do I observe a failure (1) or is the process right-censored (0)?
* Defined only on employee rows whose next state is not "Employee":
* 1 when a next state is recorded, 0 when the next state is missing
* (no further interview, or an empty state).
bysort ind_id (date): gen fail_e = !missing(work_state_name[_n+1]) if work_state_name == "Employee" & work_state_name[_n+1] != "Employee"

* [tb_e] Right boundary of the interval where the failure takes place (one obs. per individual)
* = reported duration plus the time elapsed until the next interview
bysort ind_id (date): gen tb_e = how_long_eb + (date[_n+1] - date) if fail_e == 1

* Keep only the data that refers to the first observed transition (one obs. per individual)
keep if date == last_obs_e

* Remove individuals observed at risk one single time (they do not contribute to the likelihood anyway)
drop if t0_e == ta_e & missing(tb_e)
drop first_obs_e obs_window still_at_risk_e last_obs_e

* Export the data
save "${analysis}/data/2_5_pnad_clean_duration_e.dta", replace


*************************
* UNEMPLOYMENT duration *
*************************

/* Only workers who were observed in unemployment at least once, one row
per individual, covariates measured as of the last observation before transition or censoring.
Note: the strategy below assumes the latest information is the correct one in conflicting cases.

Implementation note: every statement that looks at the next interview ([_n+1])
is run under "bysort ind_id (date):", so observations are explicitly ordered by
date within individual at the moment the statement runs. A plain
"bysort ind_id:" only guarantees grouping by ind_id; the within-individual
order set by an earlier -sort- can be lost if the data are re-sorted in between
(e.g. internally by -egen-), because -sort- does not preserve the order of ties.
NOTE(review): this assumes (ind_id, date) uniquely identifies a row - confirm. */

* Import general data set
use "${analysis}/data/2_4_pnad_clean.dta", clear

* Sort dataset by individual and then by date
sort ind_id date

* Define interview rounds (useful as sanity check if need to debug)
* bysort ind_id (date): gen int_n = _n
* order int_n

* Remove individuals observed a single time
bysort ind_id: gen int_N = _N
drop if int_N == 1
drop int_N

* Subset the sample to individuals who were seen at risk (i.e. in unemployment) at least once
bysort ind_id: egen ever_unemployed = total(work_state_name == "Unemployed")
keep if ever_unemployed != 0
drop ever_unemployed

* [how_long_u] The main object of interest, the reported duration of the spell at each date
* (blanked out on rows where the individual is not unemployed)
replace how_long_ua = . if work_state_name != "Unemployed"
replace how_long_ub = . if work_state_name != "Unemployed"

* [first_obs_u] FIRST date the individual is observed at risk (constant within individual)
gen date_u = date if work_state_name == "Unemployed"
bysort ind_id: egen first_obs_u = min(date_u)
drop date_u

* [obs_window] For how long has the individual been observed at risk? i.e. follow-up length at each date
gen obs_window = date - first_obs_u if work_state_name == "Unemployed"

* [t0_u] The implicit elapsed duration before the observation window
* (negative values are set to 0; missing values stay missing because missing is not < 0)
gen t0_u = how_long_ua - obs_window
replace t0_u = 0 if t0_u < 0

* [still_at_risk_u] Do I see the individual still at risk after this period?
* 1 = unemployed now and at the next interview; 0 = unemployed now but not at the next
* interview (including when there is no next interview); missing when not unemployed now
bysort ind_id (date): gen still_at_risk_u = (work_state_name[_n+1] == "Unemployed") if work_state_name == "Unemployed"

* [last_obs_u] When do I see the FIRST relevant state change? (constant within individual)
gen date_u = date if still_at_risk_u == 0
bysort ind_id: egen last_obs_u = min(date_u)
drop date_u

* [ta_u] Left boundary of the interval where the failure takes place (one obs. per individual)
gen ta_u = how_long_ua if last_obs_u == date

* [fail_u] Do I observe a failure (1) or is the process right-censored (0)?
* Defined only on unemployed rows whose next state is not "Unemployed":
* 1 when the next state is "Employee", 0 for any other next state
* (including no further interview).
bysort ind_id (date): gen fail_u = (work_state_name[_n+1] == "Employee") if work_state_name == "Unemployed" & work_state_name[_n+1] != "Unemployed"

* [tb_u] Right boundary of the interval where the failure takes place (one obs. per individual)
* = reported duration plus the time elapsed until the next interview
bysort ind_id (date): gen tb_u = how_long_ub + (date[_n+1] - date) if fail_u == 1

* Keep only the data that refers to the first observed transition (one obs. per individual)
keep if date == last_obs_u

* Remove individuals observed at risk one single time (they do not contribute to the likelihood anyway)
drop if t0_u == ta_u & missing(tb_u)
drop first_obs_u obs_window still_at_risk_u last_obs_u

* Export the data
save "${analysis}/data/2_5_pnad_clean_duration_u.dta", replace


* End of script: close the log
* (-capture- suppresses the error if no log is open)
******************************
capture log close